# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, tukey_hsd
import numpy as np
import statsmodels.api as sm
from scipy import stats
What Makes a Popular Song?¶
Contributions:¶
Catherine Li¶
- A: Chose some datasets to work with, collaborated in filtering to the current dataset. Suggested working on popularity.
- C: Worked on Q1, Correlations
- D/E/F: Suggested algorithms to consider working on, worked on Neural Networks and K-Nearest Neighbors
- G: Wrote the introduction and insights/conclusion
Elizabeth Lim¶
- C: Helped graph box plots for popularity distributions between major and minor songs and between explicit and non-explicit songs
- D/E/F: Developed and trained Support Vector Regression and Linear Regression models to analyze which variables predicted popularity. Visualized both models using graphs to see the data and regression lines, and analyzed how well the models did.
Johann Paul¶
- C: Compared the distributions of major and minor keys with respect to popularity, as well as explicit and nonexplicit. Decided on a Man Whitney U test to test my hypothesis after observing a lack of normality from the qq-plot.
- D/E/F: Trained gradient boosted trees using the xgboost library to predict popularity of songs, and analyzed its performance. Visualized the distribution of prediction errors as well as the most important features.
- G: Helped edit the introduction and Primary Analysis
Nandhu Manoj Pillai¶
- B: Preprocessed dataset including encoding the genre category, cleaning missing data, removed duplicates.
- C: Worked on Q2 (How popularity was related to genre and key).
- D/E: Created RandomForestRegessor and RandomForestClassifer models and analysis.
- G: Compiled all group members' work, formatted jupyter notebook and markdown, created Github page.
I. 🎵 Introduction¶
Everyone listens to music 🎧
But have you ever wondered... what makes a song popular? 🤔
How can we predict a song’s success based on just a few key features?
So many elements go into making music —
🎼 Key
🎶 Tempo
🎻 Instrumentals
🎤 Vocals
🎧 Genre, and more!
People dedicate their careers — even their lives — to creating the next big hit 💿🔥
Wouldn't it be amazing if we could discover a formula for the perfect song? 🎯✨
In this tutorial, we’ll walk you through how we built a machine learning model 🤖
that predicts the popularity of any song using just a handful of features.
Let’s dive in!
Citation: Maharshi Pandya. (2022). 🎹 Spotify Tracks Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/4372070 This dataset contains Spotify tracks from over 100 different genres. It includes the names of the songs, artists, and many different song features such as danceability, energy, and acousticness. All these features are explained in more detail on the dataset webpage. We will use this dataset to help determine which features best predict song popularity.
II. Data Preprocessing¶
# Creating pandas dataframe of the Spotify dataset.
spotify_df = pd.read_csv('dataset.csv')
spotify_df = spotify_df.drop(columns='Unnamed: 0')
spotify_df
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | 1 | -6.746 | 0 | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | 1 | -17.235 | 1 | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | 0 | -9.734 | 1 | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | 0 | -18.515 | 1 | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | 2 | -9.681 | 1 | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113995 | 2C3TZjDRiAzdyViavDJ217 | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Sleep My Little Boy | 21 | 384999 | False | 0.172 | 0.2350 | 5 | -16.393 | 1 | 0.0422 | 0.6400 | 0.928000 | 0.0863 | 0.0339 | 125.995 | 5 | world-music |
| 113996 | 1hIz5L4IB9hN3WRYPOCGPw | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Water Into Light | 22 | 385000 | False | 0.174 | 0.1170 | 0 | -18.318 | 0 | 0.0401 | 0.9940 | 0.976000 | 0.1050 | 0.0350 | 85.239 | 4 | world-music |
| 113997 | 6x8ZfSoqDjuNa5SVP5QjvX | Cesária Evora | Best Of | Miss Perfumado | 22 | 271466 | False | 0.629 | 0.3290 | 0 | -10.895 | 0 | 0.0420 | 0.8670 | 0.000000 | 0.0839 | 0.7430 | 132.378 | 4 | world-music |
| 113998 | 2e6sXL2bYv4bSz6VTdnfLs | Michael W. Smith | Change Your World | Friends | 41 | 283893 | False | 0.587 | 0.5060 | 7 | -10.889 | 1 | 0.0297 | 0.3810 | 0.000000 | 0.2700 | 0.4130 | 135.960 | 4 | world-music |
| 113999 | 2hETkH7cOfqmz3LqZDHZf5 | Cesária Evora | Miss Perfumado | Barbincor | 22 | 241826 | False | 0.526 | 0.4870 | 1 | -10.204 | 0 | 0.0725 | 0.6810 | 0.000000 | 0.0893 | 0.7080 | 79.198 | 4 | world-music |
114000 rows × 20 columns
spotify_df.head()
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | 1 | -6.746 | 0 | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.715 | 87.917 | 4 | acoustic |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | 1 | -17.235 | 1 | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.267 | 77.489 | 4 | acoustic |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | 0 | -9.734 | 1 | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.120 | 76.332 | 4 | acoustic |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | 0 | -18.515 | 1 | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.143 | 181.740 | 3 | acoustic |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | 2 | -9.681 | 1 | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.167 | 119.949 | 4 | acoustic |
Let's learn more about the data! How many samples, features, data types, etc...
print('Number of samples:', spotify_df.shape[0])
print('Number of features:', spotify_df.shape[1])
print(f'Category Data Types:\n{spotify_df.dtypes}')
# Creating a list with all category names.
categories = spotify_df.columns.tolist()[1:]
print("Categories:", categories)
Number of samples: 114000 Number of features: 20 Category Data Types: track_id object artists object album_name object track_name object popularity int64 duration_ms int64 explicit bool danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 time_signature int64 track_genre object dtype: object Categories: ['artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']
Now let's look if there are any missing values!
missing_data = spotify_df.isna().sum()
print(missing_data)
# Saving missing values as dictionary as missing values for each category.
missing_data = missing_data.to_dict()
track_id 0 artists 1 album_name 1 track_name 1 popularity 0 duration_ms 0 explicit 0 danceability 0 energy 0 key 0 loudness 0 mode 0 speechiness 0 acousticness 0 instrumentalness 0 liveness 0 valence 0 tempo 0 time_signature 0 track_genre 0 dtype: int64
# Just making sure the count is the same for each column
spotify_df.count()
track_id 114000 artists 113999 album_name 113999 track_name 113999 popularity 114000 duration_ms 114000 explicit 114000 danceability 114000 energy 114000 key 114000 loudness 114000 mode 114000 speechiness 114000 acousticness 114000 instrumentalness 114000 liveness 114000 valence 114000 tempo 114000 time_signature 114000 track_genre 114000 dtype: int64
Looks like there is 1 missing value each in the artists, album_name, and track_name categories. Since
the proportion of missing data is <5% we can use Listwise Deletion to get rid of the missing values.
# Dropping the row with NA values in the dataframe.
spotify_df = spotify_df.dropna()
Now check for duplicate values in the dataset.
duplicates = spotify_df[spotify_df.duplicated()]
len(duplicates)
450
Looks like there are 450 rows with exact duplicates in the dataset that need to be removed.
spotify_df['track_id'].duplicated(keep=False).sum()
40900
duplicated_ids = spotify_df['track_id'].value_counts()
duplicated_ids = duplicated_ids[duplicated_ids > 1]
duplicated_ids
track_id
6S3JlDAGk3uu3NtZbPnuhS 9
2Ey6v4Sekh3Z0RUSISRosD 8
2kkvB3RNRzwjFdGhaUA0tz 8
08kTa3SL9sV6Iy8KLKtGql 7
4XYieGKSlJlHpzB3bl6WMP 7
..
2K6jJQ1i2SVNWLEF06Ha4B 2
0hjpo40L9XPirSdaJZOGB2 2
40XeGNGFchGYw7y0ue1GiG 2
50xwQXPtfNZFKFeZ0XePWc 2
6NDoBIaqTHdcudaR8RDJNw 2
Name: count, Length: 16641, dtype: int64
Digging further it can be seen that there are 16,641 tracks that are listed multiple times in the dataset since they fall under multiple genres. This information can come in handy later during model creation.
# Removing the exact duplicates from the dataset
spotify_df = spotify_df.drop_duplicates()
spotify_df.duplicated().sum()
0
The category, track_genre, is currently a string (object) describing what the genre of the particular song is.
We need to convert this to numeric values through feature encoding for our model to use later on.
# Checking cardinality to see what type of encoding would best fit this scenario.
print(spotify_df['track_genre'].nunique())
114
Since this category has a high cardinality, one hot encoding would create too many new columns.
Let's try using frequency encoding here to convert this column.
freq_encoding = spotify_df['track_genre'].value_counts()
spotify_df['track_genre_freq'] = spotify_df['track_genre'].map(freq_encoding)
spotify_df
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | ... | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | track_genre_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | 1 | ... | 0 | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic | 1000 |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | 1 | ... | 1 | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic | 1000 |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | 0 | ... | 1 | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic | 1000 |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | 0 | ... | 1 | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic | 1000 |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | 2 | ... | 1 | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic | 1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113995 | 2C3TZjDRiAzdyViavDJ217 | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Sleep My Little Boy | 21 | 384999 | False | 0.172 | 0.2350 | 5 | ... | 1 | 0.0422 | 0.6400 | 0.928000 | 0.0863 | 0.0339 | 125.995 | 5 | world-music | 999 |
| 113996 | 1hIz5L4IB9hN3WRYPOCGPw | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Water Into Light | 22 | 385000 | False | 0.174 | 0.1170 | 0 | ... | 0 | 0.0401 | 0.9940 | 0.976000 | 0.1050 | 0.0350 | 85.239 | 4 | world-music | 999 |
| 113997 | 6x8ZfSoqDjuNa5SVP5QjvX | Cesária Evora | Best Of | Miss Perfumado | 22 | 271466 | False | 0.629 | 0.3290 | 0 | ... | 0 | 0.0420 | 0.8670 | 0.000000 | 0.0839 | 0.7430 | 132.378 | 4 | world-music | 999 |
| 113998 | 2e6sXL2bYv4bSz6VTdnfLs | Michael W. Smith | Change Your World | Friends | 41 | 283893 | False | 0.587 | 0.5060 | 7 | ... | 1 | 0.0297 | 0.3810 | 0.000000 | 0.2700 | 0.4130 | 135.960 | 4 | world-music | 999 |
| 113999 | 2hETkH7cOfqmz3LqZDHZf5 | Cesária Evora | Miss Perfumado | Barbincor | 22 | 241826 | False | 0.526 | 0.4870 | 1 | ... | 0 | 0.0725 | 0.6810 | 0.000000 | 0.0893 | 0.7080 | 79.198 | 4 | world-music | 999 |
113549 rows × 21 columns
spotify_df['track_genre_freq'].unique()
array([1000, 999, 997, 998, 933, 995, 996, 965, 993, 963, 991,
981, 988, 990, 994, 992, 904], dtype=int64)
Looks like frequency encoding might not be the best option since looks the frequency of the genres are 999 or 1000 which loses much of the uniqueness and information we are looking for from this category. Another good option for encoding when there is high cardinality is Target/Mean Encoding, let's try that!
genre_popularity_map = spotify_df.groupby('track_genre')['popularity'].mean()
spotify_df['track_genre_target_enc'] = spotify_df['track_genre'].map(genre_popularity_map)
print('No. Unique Values: ', spotify_df['track_genre_target_enc'].nunique())
spotify_df
No. Unique Values: 113
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | ... | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | track_genre_freq | track_genre_target_enc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | 1 | ... | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic | 1000 | 42.483000 |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | 1 | ... | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic | 1000 | 42.483000 |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | 0 | ... | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic | 1000 | 42.483000 |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | 0 | ... | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic | 1000 | 42.483000 |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | 2 | ... | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic | 1000 | 42.483000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113995 | 2C3TZjDRiAzdyViavDJ217 | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Sleep My Little Boy | 21 | 384999 | False | 0.172 | 0.2350 | 5 | ... | 0.0422 | 0.6400 | 0.928000 | 0.0863 | 0.0339 | 125.995 | 5 | world-music | 999 | 41.880881 |
| 113996 | 1hIz5L4IB9hN3WRYPOCGPw | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Water Into Light | 22 | 385000 | False | 0.174 | 0.1170 | 0 | ... | 0.0401 | 0.9940 | 0.976000 | 0.1050 | 0.0350 | 85.239 | 4 | world-music | 999 | 41.880881 |
| 113997 | 6x8ZfSoqDjuNa5SVP5QjvX | Cesária Evora | Best Of | Miss Perfumado | 22 | 271466 | False | 0.629 | 0.3290 | 0 | ... | 0.0420 | 0.8670 | 0.000000 | 0.0839 | 0.7430 | 132.378 | 4 | world-music | 999 | 41.880881 |
| 113998 | 2e6sXL2bYv4bSz6VTdnfLs | Michael W. Smith | Change Your World | Friends | 41 | 283893 | False | 0.587 | 0.5060 | 7 | ... | 0.0297 | 0.3810 | 0.000000 | 0.2700 | 0.4130 | 135.960 | 4 | world-music | 999 | 41.880881 |
| 113999 | 2hETkH7cOfqmz3LqZDHZf5 | Cesária Evora | Miss Perfumado | Barbincor | 22 | 241826 | False | 0.526 | 0.4870 | 1 | ... | 0.0725 | 0.6810 | 0.000000 | 0.0893 | 0.7080 | 79.198 | 4 | world-music | 999 | 41.880881 |
113549 rows × 22 columns
III. Exploratory Data Analysis¶
🎯 Q1: Correlations¶
We will start our analysis by determining how each feature is related to each other. To do this, we will create a plot of the correlations between each numerical feature.
#Credit: https://www.geeksforgeeks.org/data-analysis/exploring-correlation-in-python/
corr = spotify_df.corr(method = 'pearson', numeric_only= True)
plt.figure(figsize=(10,8), dpi =500)
plt.title("Correlations between Features")
ax = sns.heatmap(corr,annot=True,fmt=".2f", linewidth=.5)
ax.set_xlabel("Features")
ax.set_ylabel("Features")
plt.show()
We can see from the above plot that some features are more correlated, such as acousticness and energy, as well as acousticness and loudness. However, most features are not highly correlated. This is easier to see when we plot the absolute value of the correlation.
plt.figure(figsize=(10,8), dpi =500)
plt.title("Correlations between Features")
ax = sns.heatmap(corr.abs(),annot=True,fmt=".2f", linewidth=.5)
ax.set_xlabel("Features")
ax.set_ylabel("Features")
plt.show()
Most of the pairs of features have absolute correlation less than 0.1. To expand on this, let's find out which features are most correlated to which.
pairs = list()
for column in corr.columns:
related_feature = corr[column].abs().sort_values(ascending= False).iloc[[1]].index.array[0]
pairs.append([column, related_feature])
print(column, " is most correlated with ", related_feature)
popularity is most correlated with track_genre_target_enc duration_ms is most correlated with valence explicit is most correlated with speechiness danceability is most correlated with valence energy is most correlated with loudness key is most correlated with mode loudness is most correlated with energy mode is most correlated with key speechiness is most correlated with explicit acousticness is most correlated with energy instrumentalness is most correlated with loudness liveness is most correlated with speechiness valence is most correlated with danceability tempo is most correlated with energy time_signature is most correlated with danceability track_genre_freq is most correlated with track_genre_target_enc track_genre_target_enc is most correlated with popularity
Some interesting points come from this:
- the duration of a song tends to be longer when the valence ("musical positiveness") is lower,
- in contrast the danceability tends to be higher when the valence is higher,
- energy (intensity) is highly correlated with loudness and acousticness,
- the tempo tends to be faster when the energy is higher.
Above all however, is that popularity is not highly correlated with any feature except for potentially track genre.
🎯 Q2: Is popularity related to genre/key?¶
🔬 Statistical Method: One-way ANOVA (Analysis of Variance)
# Create a list of popularity scores per genre
genre_groups = [group['popularity'].values for name, group in spotify_df.groupby('track_genre')]
F = f_oneway(*genre_groups)
print("P-Value received from the ANOVA-testing", F.pvalue)
P-Value received from the ANOVA-testing 0.0
This p-value tells us that there is there is strong evidence that at least one genre has a different mean popularity compared to the others since it is <0.05
From doing the ANOVA test, we can only learn that there is a significant difference between the means of the groups included. Now to find out which one of these groups differ, we must do a Post Hoc Test such as Tukey's Honest Significant Difference (HSD). Here I will do the HSD test on the 10 most popular genre's since doing this test on 114 genres will become overwhelming.
# Find top 10 genres by mean popularity
top_10_popular_genres = (
spotify_df.groupby('track_genre')['popularity']
.mean()
.sort_values(ascending=False)
.head(10)
.index
)
# Filter dataframe
df_top10 = spotify_df[spotify_df['track_genre'].isin(top_10_popular_genres)]
# Convert popularity values into a list per genre
groups = [df_top10[df_top10['track_genre'] == genre]['popularity'].values
for genre in top_10_popular_genres]
# Run Tukey HSD
res = tukey_hsd(*groups)
We are going to make the result of the HSD into a pandas dataframe to make it easier to view.
genre_list = list(top_10_popular_genres)
comparisons = []
for (i, j), stat in np.ndenumerate(res.statistic):
if i < j:
comparisons.append({
'Genre A': genre_list[i],
'Genre B': genre_list[j],
'Mean Diff': stat,
'p-value': res.pvalue[i, j],
'Lower CI': res.confidence_interval().low[i, j],
'Upper CI': res.confidence_interval().high[i, j],
'Significant': res.pvalue[i, j] < 0.05
})
# Convert to DataFrame
tukey_df = pd.DataFrame(comparisons)
tukey_df = tukey_df.sort_values(by='p-value')
tukey_df
| Genre A | Genre B | Mean Diff | p-value | Lower CI | Upper CI | Significant | |
|---|---|---|---|---|---|---|---|
| 22 | chill | pop | 5.801381 | 0.000000e+00 | 3.418556 | 8.184207 | True |
| 23 | chill | sertanejo | 5.838705 | 0.000000e+00 | 3.460066 | 8.217344 | True |
| 16 | k-pop | sertanejo | 9.097928 | 0.000000e+00 | 6.718693 | 11.477163 | True |
| 15 | k-pop | pop | 9.060605 | 0.000000e+00 | 6.677184 | 11.444025 | True |
| 14 | k-pop | emo | 8.835928 | 0.000000e+00 | 6.456693 | 11.215163 | True |
| 12 | k-pop | indian | 7.435399 | 0.000000e+00 | 5.055569 | 9.815229 | True |
| 11 | k-pop | grunge | 7.381345 | 0.000000e+00 | 5.001515 | 9.761175 | True |
| 8 | pop-film | sertanejo | 11.414280 | 0.000000e+00 | 9.035641 | 13.792919 | True |
| 13 | k-pop | anime | 8.197161 | 0.000000e+00 | 5.817331 | 10.576991 | True |
| 6 | pop-film | emo | 11.152280 | 0.000000e+00 | 8.773641 | 13.530919 | True |
| 5 | pop-film | anime | 10.513514 | 0.000000e+00 | 8.134280 | 12.892747 | True |
| 4 | pop-film | indian | 9.751752 | 0.000000e+00 | 7.372518 | 12.130986 | True |
| 3 | pop-film | grunge | 9.697698 | 0.000000e+00 | 7.318464 | 12.076932 | True |
| 2 | pop-film | sad | 6.901280 | 0.000000e+00 | 4.522641 | 9.279919 | True |
| 7 | pop-film | pop | 11.376957 | 0.000000e+00 | 8.994132 | 13.759782 | True |
| 21 | chill | emo | 5.576705 | 4.717338e-12 | 3.198066 | 7.955344 | True |
| 1 | pop-film | chill | 5.575576 | 4.863887e-12 | 3.196342 | 7.954810 | True |
| 20 | chill | anime | 4.937938 | 2.413916e-09 | 2.558704 | 7.317172 | True |
| 10 | k-pop | sad | 4.584928 | 4.999529e-08 | 2.205693 | 6.964163 | True |
| 29 | sad | sertanejo | 4.513000 | 8.863988e-08 | 2.134956 | 6.891044 | True |
| 28 | sad | pop | 4.475677 | 1.279613e-07 | 2.093445 | 6.857908 | True |
| 27 | sad | emo | 4.251000 | 7.089971e-07 | 1.872956 | 6.629044 | True |
| 19 | chill | indian | 4.176176 | 1.276040e-06 | 1.796942 | 6.555410 | True |
| 18 | chill | grunge | 4.122122 | 1.916241e-06 | 1.742888 | 6.501356 | True |
| 26 | sad | anime | 3.612233 | 6.834448e-05 | 1.233594 | 5.990872 | True |
| 9 | k-pop | chill | 3.259223 | 6.234850e-04 | 0.879393 | 5.639053 | True |
| 25 | sad | indian | 2.850471 | 5.832857e-03 | 0.471832 | 5.229111 | True |
| 24 | sad | grunge | 2.796417 | 7.650167e-03 | 0.417778 | 5.175057 | True |
| 0 | pop-film | k-pop | 2.316352 | 6.415646e-02 | -0.063477 | 4.696182 | False |
| 34 | grunge | sertanejo | 1.716583 | 3.997287e-01 | -0.662057 | 4.095222 | False |
| 33 | grunge | pop | 1.679259 | 4.359465e-01 | -0.703566 | 4.062085 | False |
| 38 | indian | sertanejo | 1.662529 | 4.485840e-01 | -0.716111 | 4.041168 | False |
| 37 | indian | pop | 1.625205 | 4.859952e-01 | -0.757620 | 4.008031 | False |
| 32 | grunge | emo | 1.454583 | 6.448710e-01 | -0.924057 | 3.833222 | False |
| 36 | indian | emo | 1.400529 | 6.941382e-01 | -0.978111 | 3.779168 | False |
| 17 | chill | sad | 1.325705 | 7.582107e-01 | -1.052934 | 3.704344 | False |
| 41 | anime | sertanejo | 0.900767 | 9.727925e-01 | -1.477872 | 3.279406 | False |
| 40 | anime | pop | 0.863444 | 9.797852e-01 | -1.519382 | 3.246269 | False |
| 31 | grunge | anime | 0.815816 | 9.862179e-01 | -1.563418 | 3.195050 | False |
| 35 | indian | anime | 0.761762 | 9.915623e-01 | -1.617472 | 3.140996 | False |
| 39 | anime | emo | 0.638767 | 9.977545e-01 | -1.739872 | 3.017406 | False |
| 43 | emo | sertanejo | 0.262000 | 9.999987e-01 | -2.116044 | 2.640044 | False |
| 42 | emo | pop | 0.224677 | 9.999997e-01 | -2.157555 | 2.606908 | False |
| 30 | grunge | indian | 0.054054 | 1.000000e+00 | -2.325180 | 2.433288 | False |
| 44 | pop | sertanejo | 0.037323 | 1.000000e+00 | -2.344908 | 2.419555 | False |
From this result of the HSD test and it's table, you can see that the categories pop-film, film, and chill were the majority of the significant difference when comparisons where made between genres. This suggests that pop-film, k-pop, and chill are are significantly more popular than emo, anime, indian, grunge, sertanejo, sad, and pop as the conclusion.
Below I've plotted the Top 10 Most Popular Genres to show that this relationship found can also be seen from a simple averaging of popularity scores on the track_genres.
# Compute mean popularity
mean_popularity = df_top10.groupby('track_genre')['popularity'].mean().sort_values(ascending=False)
plt.figure(figsize=(12,6))
sns.barplot(x=mean_popularity.index, y=mean_popularity.values, palette="viridis")
plt.title('Top 10 Most Popular Genres by Average Popularity')
plt.ylabel('Mean Popularity')
plt.xlabel('Genre')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
/var/folders/tv/89ds0n_563nfsn6bpr8c6fv00000gn/T/ipykernel_43264/3331727488.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=mean_popularity.index, y=mean_popularity.values, palette="viridis")
🎯 Q3: Are songs in a major key more popular than songs in a minor key? How about explicit vs non-explicit?¶
Let's separate the songs between those that are in major vs minor key.
songs_in_minor_keys = spotify_df[spotify_df["mode"] == 0]
songs_in_major_keys = spotify_df[spotify_df["mode"] == 1]
songs_in_minor_keys.head()
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | ... | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | track_genre_freq | track_genre_target_enc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | 1 | ... | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.715 | 87.917 | 4 | acoustic | 1000 | 42.483 |
| 12 | 4ptDJbJl35d7gQfeNteBwp | Dan Berk | Solo | Solo | 52 | 198712 | False | 0.489 | 0.3140 | 7 | ... | 0.0331 | 0.7490 | 0.000000 | 0.1130 | 0.607 | 124.234 | 4 | acoustic | 1000 | 42.483 |
| 17 | 4Yo0igmcoNyat1secaH0OD | Andrew Foy;Renee Foy | At My Worst | At My Worst | 54 | 169728 | False | 0.795 | 0.0841 | 10 | ... | 0.0461 | 0.7420 | 0.000012 | 0.0853 | 0.609 | 91.803 | 4 | acoustic | 1000 | 42.483 |
| 24 | 3Hn3LfhrQOaKihdCibJsTs | Jason Mraz | Human - Best Adult Pop Tunes | Unlonely | 0 | 231266 | False | 0.796 | 0.6670 | 5 | ... | 0.0392 | 0.3810 | 0.000000 | 0.2210 | 0.754 | 97.988 | 4 | acoustic | 1000 | 42.483 |
| 25 | 6D33wCKzWtNEgOovgeVJ7r | Jason Mraz | Mellow Adult Pop | Bella Luna | 1 | 302346 | False | 0.755 | 0.4540 | 9 | ... | 0.0352 | 0.7570 | 0.000000 | 0.2360 | 0.330 | 120.060 | 4 | acoustic | 1000 | 42.483 |
5 rows × 22 columns
songs_in_major_keys.head()
| track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | key | ... | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | track_genre_freq | track_genre_target_enc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | 1 | ... | 0.0763 | 0.924 | 0.000006 | 0.1010 | 0.267 | 77.489 | 4 | acoustic | 1000 | 42.483 |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | 0 | ... | 0.0557 | 0.210 | 0.000000 | 0.1170 | 0.120 | 76.332 | 4 | acoustic | 1000 | 42.483 |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | 0 | ... | 0.0363 | 0.905 | 0.000071 | 0.1320 | 0.143 | 181.740 | 3 | acoustic | 1000 | 42.483 |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | 2 | ... | 0.0526 | 0.469 | 0.000000 | 0.0829 | 0.167 | 119.949 | 4 | acoustic | 1000 | 42.483 |
| 5 | 01MVOl9KtVTNfFiBU9I7dc | Tyrone Wells | Days I Will Remember | Days I Will Remember | 58 | 214240 | False | 0.688 | 0.4810 | 6 | ... | 0.1050 | 0.289 | 0.000000 | 0.1890 | 0.666 | 98.017 | 4 | acoustic | 1000 | 42.483 |
5 rows × 22 columns
Let's look at the distribution of the popularity of the songs in minor keys.
songs_in_minor_keys["popularity"].plot(kind = 'hist')
plt.show()
Let's look at the distribution of the popularity of the songs in major keys.
songs_in_major_keys["popularity"].plot(kind = 'hist')
plt.show()
These qqplots (Quantile-Quantile Plots) confirm that the distribution of the popularity for both major and minor key songs are not normal. If the distributions were normal, then the scatter would fit close to the line shown below, however, we can see some deviation on the ends of the theoretical quantiles, which suggests a more "dispersed" distribution. This aligns with what we see in the histograms.
#Credit: https://www.geeksforgeeks.org/machine-learning/quantile-quantile-plots/
stats.probplot(songs_in_minor_keys['popularity'], dist="norm", plot=plt)
plt.title('Normal Q-Q plot')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Ordered Values')
plt.grid(True)
plt.show()
#fig = sm.qqplot(songs_in_minor_keys['popularity'], line='45')
#Credit: https://www.geeksforgeeks.org/machine-learning/quantile-quantile-plots/
stats.probplot(songs_in_major_keys['popularity'], dist="norm", plot=plt)
plt.title('Normal Q-Q plot')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Ordered Values')
plt.grid(True)
plt.show()
#fig = sm.qqplot(songs_in_major_keys['popularity'], line='45')
Note that from the above results it is clear that these two populations are not normal. Hence we must use the Man Whiteny U test instead.
H_0: The popularity in major key and minor key come from the same distribution.
H_a: The popularity in major key and minor key come from the different distributions.
major_key_popularities = songs_in_major_keys["popularity"]
minor_key_popularities = songs_in_minor_keys["popularity"]
stat, p_val = stats.mannwhitneyu(major_key_popularities, minor_key_popularities)
print("The p-value is", p_val, ".")
The p-value is 1.593987121786141e-07 .
if p_val < 0.01:
print("Reject the null hypothesis: There is a statistically significant difference in the distribution of popularity between the major key and minor keys.")
else:
print("Fail to reject the null hypothesis: No statistically significant difference in the distribution of popularity between the two groups.")
Reject the null hypothesis: There is a statistically significant difference in the distribution of popularity between the major key and minor keys.
The above Man Whiteney U test suggests that there is sufficient evidence to reject the null hypothesis that the distributions of popularity between major and minor keys is the same. Instead, we conclude that the distributions must be different: thus the key likely has an influence on popularity.
float(major_key_popularities.mean())
33.0724295516989
float(minor_key_popularities.mean())
33.76831225680934
We see that the means from both distributions are quite similar, despite being from different distributions.
# Get the popularity means for explicit and nonexplicit songs
mean_major = major_key_popularities.mean()
mean_minor = minor_key_popularities.mean()
# Group songs by explicit/non-explicit status and show popularity
grouped = spotify_df.groupby('mode')['popularity']
###############################################################################
# BOXPLOT GRAPH
# Credit to: https://python-graph-gallery.com/551-student-t-test-visualization/
# Init a figure and axes
fig, ax = plt.subplots(figsize=(8, 6))
# Create the plot with different colors for each group
boxplot = ax.boxplot(x=[group.values for name, group in grouped],
labels=["Major", "Minor"],
patch_artist=True,
medianprops={'color': 'black'}
)
# Define colors for each group
colors = ['red', 'blue']
# Assign colors to each box in the boxplot
for box, color in zip(boxplot['boxes'], colors):
box.set_facecolor(color)
# Add the mean for each group
ax.text(1.1, mean_major, f'Major: {mean_major:.2f}')
ax.text(1.64, mean_minor, f'Minor: {mean_minor:.2f}')
# Add a title and axis label
ax.set_title('Popularity Distribution between Major and Minor Songs')
ax.set_ylabel('Popularity')
# Display it
plt.show()
/var/folders/tv/89ds0n_563nfsn6bpr8c6fv00000gn/T/ipykernel_43264/3213480038.py:16: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = ax.boxplot(x=[group.values for name, group in grouped],
Now let's look at the difference in popularity between explicit and non-explicit songs.¶
explicit_songs = spotify_df[spotify_df["explicit"] == True]
non_explicit_songs = spotify_df[spotify_df["explicit"] == False]
explicit_songs["popularity"].plot(kind = 'hist')
plt.show()
non_explicit_songs["popularity"].plot(kind = 'hist')
plt.show()
Similar to the major/minor popularity distributions, these do not have a normal bell-shape either. Let's look at the Q-Q Plots.
#Credit: https://www.geeksforgeeks.org/machine-learning/quantile-quantile-plots/
stats.probplot(explicit_songs['popularity'], dist="norm", plot=plt)
plt.title('Normal Q-Q plot')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Ordered Values')
plt.grid(True)
plt.show()
#explicit_songs_popularity_distr = sm.qqplot(explicit_songs['popularity'], line='45')
#Credit: https://www.geeksforgeeks.org/machine-learning/quantile-quantile-plots/
stats.probplot(non_explicit_songs['popularity'], dist="norm", plot=plt)
plt.title('Normal Q-Q plot')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Ordered Values')
plt.grid(True)
plt.show()
#nonexplicit_songs_popularity_dist = sm.qqplot(non_explicit_songs['popularity'], line='45')
Note that from the above results it is clear that these two populations are not normal. Hence we must use the Man Whiteny U test instead.
H_0: The popularity of explicit and non-explicit songs come from the same distribution.
H_a: The popularity of explicit and non-explicit songs come from different distributions.
explicit_song_popularities = explicit_songs["popularity"]
non_explicit_song_popularities = non_explicit_songs["popularity"]
stat_e, p_val_e = stats.mannwhitneyu(explicit_song_popularities, non_explicit_song_popularities)
print("The p-value is", p_val_e, ".")
The p-value is 1.3317030463864243e-41 .
if p_val_e < 0.01:
print("Reject the null hypothesis: There is a statistically significant difference in the distribution of popularity between explicit songs and non-explicit songs.")
else:
print("Fail to reject the null hypothesis: No statistically significant difference in the distribution of popularity between between explicit songs and non-explicit songs.")
Reject the null hypothesis: There is a statistically significant difference in the distribution of popularity between explicit songs and non-explicit songs.
# Get the popularity means for explicit and nonexplicit songs
mean_explicit = explicit_song_popularities.mean()
mean_non_explicit = non_explicit_song_popularities.mean()
# Group songs by explicit/non-explicit status and show popularity
grouped = spotify_df.groupby('explicit')['popularity']
###############################################################################
# BOXPLOT GRAPH
# Credit to: https://python-graph-gallery.com/551-student-t-test-visualization/
# Init a figure and axes
fig, ax = plt.subplots(figsize=(8, 6))
# Create the plot with different colors for each group
boxplot = ax.boxplot(x=[group.values for name, group in grouped],
labels=["Explicit", "Non-Explicit"],
patch_artist=True,
medianprops={'color': 'black'}
)
# Define colors for each group
colors = ['red', 'blue']
# Assign colors to each box in the boxplot
for box, color in zip(boxplot['boxes'], colors):
box.set_facecolor(color)
# Add the mean for each group
ax.text(1.1, mean_explicit, f'Mean: {mean_explicit:.2f}')
ax.text(1.64, mean_non_explicit, f'Mean: {mean_non_explicit:.2f}')
# Add a title and axis label
ax.set_title('Popularity Distribution between Explicit and Non-Explicit Songs')
ax.set_ylabel('Popularity')
# Display it
plt.show()
/var/folders/tv/89ds0n_563nfsn6bpr8c6fv00000gn/T/ipykernel_43264/4160170839.py:16: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. boxplot = ax.boxplot(x=[group.values for name, group in grouped],
IV. Machine Learning Analysis¶
After testing multiple model LinearRegression, SVR, RandomForestRegressor, KNN, Neural Network, XGBoost to predict the popularity of a song. The best results came from the KNN model which received a score of 12.16 on the RMSE test and a 0.70 R squared value. Therefore, the ML analysis will only guide you through our work from the K-NN, but the work the team did on the other models is attached at the bottom of the notebook for reference. Another approach we decided to do after a underwhelming results from the regression model was to take a classification approach. The songs were categorized into 3 categories (low, medium, and high popularity) then a RandomForstClassifier was used.
K-Nearest Neighbors¶
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
artists_popularity_map = spotify_df.groupby('artists')['popularity'].mean()
spotify_df['artists_target_enc'] = spotify_df['artists'].map(artists_popularity_map)
#encode explicit as 1 or 0
spotify_df['explicit_enc'] = spotify_df['explicit'].map(lambda x: 1 if x else 0)
#drop non-numeric columns
X = spotify_df.drop(columns=['popularity', 'artists', 'track_id', 'album_name', 'track_name', 'explicit', 'track_genre', 'track_genre_freq'])
y = spotify_df['popularity']
display(X.head())
display(y.head())
display(X.dtypes)
display(y.dtypes)
#Convert type of data into floats
X = X.astype('float32')
y = y.astype('float32')
display(X.dtypes)
display(y.dtypes)
random_state= 42
#Credit: HW4
#split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
#Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train.values)
X_test_tensor = torch.tensor(X_test)
y_test_tensor = torch.tensor(y_test.values)
# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
batch_size = train_loader.batch_size
num_train_data = len(train_loader) * batch_size
num_test_data = len(test_loader) * batch_size
feat_dim = train_loader.dataset[0][0].shape[0]
We must first determine the best value for k using cross validation.
#find the best cluster size
def knn(cluster_size):
neigh = KNeighborsRegressor(n_neighbors=cluster_size, weights= 'distance')
neigh.fit(X_train, y_train)
return cross_val_score(neigh, X_train, y_train, cv= 5).mean() #5 folds
scores = []
cluster_sizes = []
for i in range(5, 70, 5):
scores.append(knn(i))
cluster_sizes.append(i)
plt.plot(cluster_sizes, scores)
[<matplotlib.lines.Line2D at 0x12918a5d0>]
It seems like the best cluster size is 25 with score ~0.7, after which there is minimal benefit of increasing size.
## define code for visualizing the results
def ml_visual(sample, target, preds):
sample_df = pd.DataFrame(sample, columns= X.columns)
target_df = pd.DataFrame(target, columns= ['popularity'])
preds_df = pd.DataFrame(preds, columns= ['predicted_popularity'])
sample_df = sample_df.merge(target_df, left_index=True, right_index=True)
sample_df = sample_df.merge(preds_df, left_index=True, right_index=True)
display(sample_df.head(10))
plt.scatter(target, preds)
plt.xlabel('actual popularity')
plt.ylabel('predicted popularity')
plt.plot(target, target)
plt.show()
neigh = KNeighborsRegressor(n_neighbors=25, weights= 'distance')
neigh.fit(X_train, y_train)
preds = neigh.predict(X_test)
sample = X_test
target = y_test
ml_visual(sample, target, preds)
| duration_ms | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre_target_enc | artists_target_enc | explicit_enc | popularity | predicted_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 0.532040 | -1.643113 | -1.988699 | -0.367280 | -2.160372 | -1.335451 | -0.482557 | 1.993330 | 2.354289 | -0.580892 | -1.038870 | 0.441346 | -2.062984 | -0.374213 | -0.559937 | -0.30251 | 71.0 | 29.293007 |
| 23 | 0.437469 | -0.294569 | 0.871115 | -1.492911 | 0.025093 | 0.748811 | -0.182322 | -0.947184 | -0.494433 | -0.136190 | -0.510412 | 0.462796 | 0.225414 | -0.096352 | -0.286441 | 3.30568 | 0.0 | 30.457305 |
| 24 | -0.098566 | 1.503490 | -0.326109 | 1.039759 | 0.395359 | -1.335451 | -0.215040 | -0.813734 | -0.503743 | -0.844537 | 1.063391 | 0.730004 | 0.225414 | 0.659819 | 0.024792 | -0.30251 | 0.0 | 32.639921 |
| 31 | -1.109964 | -1.608535 | 1.161472 | -0.930096 | 0.289342 | 0.748811 | -0.442141 | -0.922008 | -0.503747 | 0.620862 | 1.456842 | 0.507872 | 0.225414 | -0.564724 | -0.435519 | -0.30251 | 0.0 | 26.666537 |
| 35 | 0.618803 | -0.536615 | 0.015955 | -0.930096 | 0.679660 | 0.748811 | -0.226588 | 0.964154 | -0.503755 | 2.987311 | 0.137625 | 1.255052 | 0.225414 | 1.301665 | 0.732818 | -0.30251 | 0.0 | 47.323516 |
| 39 | -0.640120 | -1.504800 | 0.791565 | 1.039759 | -0.174037 | -1.335451 | -0.171737 | -0.948736 | 0.224512 | -0.146779 | 0.909097 | 0.960135 | 0.225414 | -0.387087 | -0.537519 | 3.30568 | 0.0 | 30.326756 |
| 44 | -0.153820 | 0.690906 | 1.089877 | 1.321167 | 1.089435 | -1.335451 | -0.371894 | -0.712020 | -0.502865 | -0.559716 | 1.032532 | -0.741700 | 0.225414 | -1.125621 | -1.740603 | -0.30251 | 0.0 | 0.035403 |
| 53 | -0.322118 | 0.541068 | 0.680196 | -0.930096 | 0.118801 | 0.748811 | -0.021620 | -0.939341 | -0.496246 | 0.679096 | 1.194541 | 1.099043 | 0.225414 | -1.169991 | 0.084946 | -0.30251 | 68.0 | 23.561989 |
| 61 | 0.868382 | 1.941479 | 0.513141 | -1.211504 | 0.467824 | 0.748811 | -0.339176 | -0.914545 | 1.671336 | -0.480305 | 1.371980 | -0.105520 | 0.225414 | 0.110436 | -0.799059 | -0.30251 | 62.0 | 17.432511 |
| 70 | -0.102579 | -0.721031 | -0.990349 | 0.195536 | -0.701542 | 0.748811 | -0.503727 | -0.034930 | -0.503669 | 3.903185 | -0.440979 | -1.376575 | -2.062984 | -1.501638 | -0.874302 | -0.30251 | 55.0 | 20.191181 |
This scatter looks much closer to the desired y = x line than the neural network. What's also interesting to note is that songs with 0 actual popularity have high variety in predicted popularity, which suggests that many of those songs have the characteristics of highly popular songs, according to this algorithm.
print("Coefficient of Determination", neigh.score(X_test, y_test))
Coefficient of Determination 0.702356448358433
As we can see, the coefficient of determination is also much better than the Neural Network approach. This suggests that finding songs with similar characteristics is a better indicator of popularity.
print("Coefficient of Determination", neigh.score(X_test, y_test))
print("RMSE ", ((-1) * (neigh.score(X_test, y_test) - 1) * spotify_df['popularity'].var()) ** 0.5)
Coefficient of Determination 0.702356448358433 RMSE 12.157339887336805
Random Forest¶
Another model option that we can try for our popularity prediction is a Random Forest model to it's interprebility.
Additional Reading on Random Forest & Resources:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
Looking back at our dataset we need to first identify which categories (columns) would give useful and usable information for our RandomForest model. Looking at the Spotify dataset these look like good candidates:
- duration_ms
- explicit
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- time_signature
- track_genre_target_enc
spotify_df.dtypes
track_id object artists object album_name object track_name object popularity int64 duration_ms int64 explicit bool danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 time_signature int64 track_genre object track_genre_freq int64 track_genre_target_enc float64 dtype: object
# Select features and target
feature_cols = [
'duration_ms', 'explicit', 'danceability', 'energy', 'key',
'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'time_signature', 'track_genre_target_enc'
]
# Extracting data into X and y
X = spotify_df[feature_cols]
y = spotify_df['popularity']
# Convert boolean to integer if needed
X['explicit'] = X['explicit'].astype(int)
C:\Users\mail2\AppData\Local\Temp\ipykernel_21656\738643166.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X['explicit'] = X['explicit'].astype(int)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
RandomForestRegressor(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=42)
# Predict and evaluate
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")
RMSE: 16.13 R²: 0.48
c:\Users\mail2\anaconda3\envs\MindLabs\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
The results from the Random Forest model isn't very strong. The R squared value is 0.48 which is in the okay/acceptable range. Since popularity ranges from 0 to 100, an RMSE of 16.13 means you're off by ~16 units on average. Which even considering the large range is poor performance from the model. Below you can see the importance the model gave for each feature. A probable reason for the high error from the model could be because of the imbalance in the dataset with most of the songs with low popularity scores. A potential fix or step to take could be to change from a regression task to a classification task where the popularity of the song is group into low, medium, or high and the prediction is one of the 3 categories.
importances = model.feature_importances_
features = X.columns
sorted_idx = importances.argsort()
plt.figure(figsize=(10, 6))
sns.barplot(x=importances[sorted_idx], y=features[sorted_idx])
plt.title("Feature Importances")
plt.show()
Below I am creating the popularity classes.
spotify_df['popularity_class'] = pd.qcut(
spotify_df['popularity'],
q=3,
labels=['low', 'medium', 'high']
)
X = spotify_df[feature_cols]
y = spotify_df['popularity_class']
X['explicit'] = X['explicit'].astype(int)
C:\Users\mail2\AppData\Local\Temp\ipykernel_45916\2951831378.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X['explicit'] = X['explicit'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
# Get the classification report as a dictionary
report_dict = classification_report(
y_test, y_pred, target_names=['low', 'medium', 'high'], output_dict=True
)
# Convert to DataFrame and drop 'accuracy', 'macro avg', 'weighted avg' rows if you only want per-class
report_df = pd.DataFrame(report_dict).transpose().round(2)
# Optionally drop support column if you just want metrics
metrics_to_plot = report_df.drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')[['precision', 'recall', 'f1-score']]
# Plot the heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(metrics_to_plot, annot=True, cmap='YlGnBu', fmt='.2f', cbar=False)
plt.title("Classification Report Heatmap", fontsize=14)
plt.xlabel("Metrics")
plt.ylabel("Classes")
plt.tight_layout()
plt.show()
🧠 Class-wise Interpretation¶
Low Popularity:
- Precision: 0.73 → When the model predicts "low", it's correct 73% of the time.
- Recall: 0.71 → It identifies 71% of actual "low" popularity songs.
- F1-score: 0.72 → Balanced performance.
Medium Popularity:
- Precision: 0.78, Recall: 0.80, F1-score: 0.79
- This class has the strongest performance overall.
High Popularity:
- Precision: 0.69, Recall: 0.68, F1-score: 0.68
- Slightly weaker, which is expected — hit songs are more nuanced.
📈 Overall Model Performance¶
- Accuracy: 0.73 → 73% of all predictions were correct.
- Macro Avg: 0.73 → Average score treating all classes equally.
- Weighted Avg: 0.73 → Average weighted by class size.
V. 🧠 Insights and Conclusions¶
What we might conclude from this exploration is that a song’s popularity cannot be judged straightforwardly from just one or two features like
🎵 tempo, ⚡ energy, 🎻 instrumentals, or even 🔞 explicitness.
➡️ What matters most is the combination of all these numeric features —
plus some key factors like:
- 🎼 Genre
- 👤 Artist
These two elements play a huge role in determining how popular a song becomes.
🎤 As expected, the artist really knows best what makes a hit — and that human creativity can’t be fully replaced by numbers.
Still, our model gives us an exciting way to explore what might make the next viral song!
Work for the other models for reference¶
Neural Network Section¶
Credits:
- HW4
- https://www.geeksforgeeks.org/deep-learning/how-to-implement-neural-networks-in-pytorch/
- https://docs.pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://docs.pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html
- https://stackoverflow.com/questions/42966393/is-it-good-learning-rate-for-adam-method
- https://colab.research.google.com/drive/188oDAlTSCycppp_ijjLFuyVwsWYLh1Mm?usp=sharing#scrollTo=2vsxhmnqwBtd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import numpy as np
artists_popularity_map = spotify_df.groupby('artists')['popularity'].mean()
spotify_df['artists_target_enc'] = spotify_df['artists'].map(artists_popularity_map)
#encode explicit as 1 or 0
spotify_df['explicit_enc'] = spotify_df['explicit'].map(lambda x: 1 if x else 0)
#drop non-numeric columns
X = spotify_df.drop(columns=['popularity', 'artists', 'track_id', 'album_name', 'track_name', 'explicit', 'track_genre', 'track_genre_freq'])
y = spotify_df['popularity']
display(X.head())
display(y.head())
display(X.dtypes)
display(y.dtypes)
#Convert type of data into floats
X = X.astype('float32')
y = y.astype('float32')
display(X.dtypes)
display(y.dtypes)
random_state= 42
#Credit: HW4
#split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
#Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train.values)
X_test_tensor = torch.tensor(X_test)
y_test_tensor = torch.tensor(y_test.values)
# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
batch_size = train_loader.batch_size
num_train_data = len(train_loader) * batch_size
num_test_data = len(test_loader) * batch_size
feat_dim = train_loader.dataset[0][0].shape[0]
| duration_ms | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre_target_enc | artists_target_enc | explicit_enc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 230666 | 0.676 | 0.4610 | 1 | -6.746 | 0 | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.715 | 87.917 | 4 | 42.483 | 58.000000 | 0 |
| 1 | 149610 | 0.420 | 0.1660 | 1 | -17.235 | 1 | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.267 | 77.489 | 4 | 42.483 | 42.923077 | 0 |
| 2 | 210826 | 0.438 | 0.3590 | 0 | -9.734 | 1 | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.120 | 76.332 | 4 | 42.483 | 57.000000 | 0 |
| 3 | 201933 | 0.266 | 0.0596 | 0 | -18.515 | 1 | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.143 | 181.740 | 3 | 42.483 | 53.933333 | 0 |
| 4 | 198853 | 0.618 | 0.4430 | 2 | -9.681 | 1 | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.167 | 119.949 | 4 | 42.483 | 41.727273 | 0 |
0 73 1 55 2 57 3 71 4 82 Name: popularity, dtype: int64
duration_ms int64 danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 time_signature int64 track_genre_target_enc float64 artists_target_enc float64 explicit_enc int64 dtype: object
dtype('int64')
duration_ms float32 danceability float32 energy float32 key float32 loudness float32 mode float32 speechiness float32 acousticness float32 instrumentalness float32 liveness float32 valence float32 tempo float32 time_signature float32 track_genre_target_enc float32 artists_target_enc float32 explicit_enc float32 dtype: object
dtype('float32')
try:
assert(type(train_loader) == DataLoader)
assert(type(test_loader) == DataLoader)
except:
print('"train_loader" and "test_loader" must be DataLoader type!')
print("Batch Size:", batch_size)
print("Number of Train Data:", num_train_data)
print("Number of Test Data:", num_test_data)
print("Feature Dimension:", feat_dim)
Batch Size: 128 Number of Train Data: 90880 Number of Test Data: 22784 Feature Dimension: 16
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
print(f"Using {device} device")
Using mps device
class NeuralNetwork(nn.Module):
def __init__(self, feat_dim):
super().__init__()
### 3-layer fully connected neural network.
self.linear_relu_stack = nn.Sequential(
nn.Linear(feat_dim, 1024, dtype= torch.float32),
nn.ReLU(),
nn.Linear(1024, 512, dtype= torch.float32),
nn.ReLU(),
nn.Linear(512, 1, dtype= torch.float32),
#nn.Sigmoid()
)
def forward(self, x):
logits = self.linear_relu_stack(x)
return logits
# Instantiate the model
model = NeuralNetwork(feat_dim=feat_dim).to(device)
print(model)
NeuralNetwork(
(linear_relu_stack): Sequential(
(0): Linear(in_features=16, out_features=1024, bias=True)
(1): ReLU()
(2): Linear(in_features=1024, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=1, bias=True)
)
)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr= 5e-7)
scheduler = StepLR(optimizer, step_size=20, gamma=0.5)
epochs = 100
avg_losses = []
for epoch in range(epochs):
model.train()
running_loss = 0.0
for batch_idx, (data, targets) in enumerate(train_loader):
# Forward pass
optimizer.zero_grad()
data, targets = data.to(device), targets.to(device)
pred = model(data)
loss = criterion(pred, targets)
# Backward pass
loss.backward()
optimizer.step()
running_loss += loss.item() ** 0.5 #add MSE ** 0.5 for the batch
#get the average loss of this epoch over every batch
#len(train_loader) is the number of iterations or batches in the epoch
avg_losses.append(running_loss / len(train_loader))
scheduler.step()
print(f'Epoch {epoch+1}/{epochs}, Average MSE Loss in original units: {avg_losses[-1]}')
print('Finished Training')
/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/nn/modules/loss.py:610: UserWarning: Using a target size (torch.Size([128])) that is different to the input size (torch.Size([128, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size. return F.mse_loss(input, target, reduction=self.reduction) /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/nn/modules/loss.py:610: UserWarning: Using a target size (torch.Size([87])) that is different to the input size (torch.Size([87, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size. return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1/100, Average MSE Loss in original units: 39.92520636839794 Epoch 2/100, Average MSE Loss in original units: 39.50686518096674 Epoch 3/100, Average MSE Loss in original units: 39.05217888917549 Epoch 4/100, Average MSE Loss in original units: 38.56087793677272 Epoch 5/100, Average MSE Loss in original units: 38.07235839347296 Epoch 6/100, Average MSE Loss in original units: 37.580691327247834 Epoch 7/100, Average MSE Loss in original units: 37.088776321011714 Epoch 8/100, Average MSE Loss in original units: 36.60467300129989 Epoch 9/100, Average MSE Loss in original units: 36.11216843774347 Epoch 10/100, Average MSE Loss in original units: 35.61935371054808 Epoch 11/100, Average MSE Loss in original units: 35.11754852204075 Epoch 12/100, Average MSE Loss in original units: 34.613947322476974 Epoch 13/100, Average MSE Loss in original units: 34.10113023973165 Epoch 14/100, Average MSE Loss in original units: 33.57869826697288 Epoch 15/100, Average MSE Loss in original units: 33.06935954887216 Epoch 16/100, Average MSE Loss in original units: 32.549160873178764 Epoch 17/100, Average MSE Loss in original units: 32.02299042641106 Epoch 18/100, Average MSE Loss in original units: 31.507790378209865 Epoch 19/100, Average MSE Loss in original units: 30.985146640796305 Epoch 20/100, Average MSE Loss in original units: 30.475354461395874 Epoch 21/100, Average MSE Loss in original units: 30.09046498954362 Epoch 22/100, Average MSE Loss in original units: 29.83375746973948 Epoch 23/100, Average MSE Loss in original units: 29.585399854512474 Epoch 24/100, Average MSE Loss in original units: 29.332904030094742 Epoch 25/100, Average MSE Loss in original units: 29.081692044774066 Epoch 26/100, Average MSE Loss in original units: 28.843510121167004 Epoch 27/100, Average MSE Loss in original units: 28.59885873931619 Epoch 28/100, Average MSE Loss in original units: 28.356046321282893 Epoch 29/100, Average MSE Loss in original units: 28.117990478980502 Epoch 30/100, Average MSE Loss in original units: 27.88769060661901 Epoch 31/100, Average MSE Loss in original units: 27.66195097483165 Epoch 32/100, Average MSE Loss in original units: 27.436862065079815 Epoch 33/100, Average MSE Loss in original units: 27.223354407607083 Epoch 34/100, Average MSE Loss in original units: 27.004532702980697 Epoch 35/100, Average MSE Loss in original units: 26.803662889542014 Epoch 36/100, Average MSE Loss in original units: 26.60419442634853 Epoch 37/100, Average MSE Loss in original units: 26.40331025616301 Epoch 38/100, Average MSE Loss in original units: 26.215506345024817 Epoch 39/100, Average MSE Loss in original units: 26.033998019237767 Epoch 40/100, Average MSE Loss in original units: 25.859277198161045 Epoch 41/100, Average MSE Loss in original units: 25.737548838015044 Epoch 42/100, Average MSE Loss in original units: 25.644827304417863 Epoch 43/100, Average MSE Loss in original units: 25.567906927014562 Epoch 44/100, Average MSE Loss in original units: 25.491375359734594 Epoch 45/100, Average MSE Loss in original units: 25.411843800771482 Epoch 46/100, Average MSE Loss in original units: 25.336501091073337 Epoch 47/100, Average MSE Loss in original units: 25.265102167132017 Epoch 48/100, Average MSE Loss in original units: 25.192427730523374 Epoch 49/100, Average MSE Loss in original units: 25.111967636897823 Epoch 50/100, Average MSE Loss in original units: 25.05274694251789 Epoch 51/100, Average MSE Loss in original units: 24.986491220470572 Epoch 52/100, Average MSE Loss in original units: 24.92460903355034 Epoch 53/100, Average MSE Loss in original units: 24.85779372011131 Epoch 54/100, Average MSE Loss in original units: 24.802410483419653 Epoch 55/100, Average MSE Loss in original units: 24.74470677288448 Epoch 56/100, Average MSE Loss in original units: 24.68993504147766 Epoch 57/100, Average MSE Loss in original units: 24.636219978374076 Epoch 58/100, Average MSE Loss in original units: 24.58432670458516 Epoch 59/100, Average MSE Loss in original units: 24.538592447322923 Epoch 60/100, Average MSE Loss in original units: 24.48919232471983 Epoch 61/100, Average MSE Loss in original units: 24.458720831369032 Epoch 62/100, Average MSE Loss in original units: 24.43170524370897 Epoch 63/100, Average MSE Loss in original units: 24.408261820494907 Epoch 64/100, Average MSE Loss in original units: 24.38279351241744 Epoch 65/100, Average MSE Loss in original units: 24.367187755006388 Epoch 66/100, Average MSE Loss in original units: 24.34592708132929 Epoch 67/100, Average MSE Loss in original units: 24.32201692581183 Epoch 68/100, Average MSE Loss in original units: 24.301245423231155 Epoch 69/100, Average MSE Loss in original units: 24.286866423825764 Epoch 70/100, Average MSE Loss in original units: 24.26453892518404 Epoch 71/100, Average MSE Loss in original units: 24.250377824354757 Epoch 72/100, Average MSE Loss in original units: 24.22640643253972 Epoch 73/100, Average MSE Loss in original units: 24.217306671073235 Epoch 74/100, Average MSE Loss in original units: 24.195026405639982 Epoch 75/100, Average MSE Loss in original units: 24.18100441059044 Epoch 76/100, Average MSE Loss in original units: 24.15859233125148 Epoch 77/100, Average MSE Loss in original units: 24.145951744885735 Epoch 78/100, Average MSE Loss in original units: 24.119596049336835 Epoch 79/100, Average MSE Loss in original units: 24.111905888842852 Epoch 80/100, Average MSE Loss in original units: 24.097380904017637 Epoch 81/100, Average MSE Loss in original units: 24.080247672727026 Epoch 82/100, Average MSE Loss in original units: 24.079482094928725 Epoch 83/100, Average MSE Loss in original units: 24.068945792325614 Epoch 84/100, Average MSE Loss in original units: 24.056763962203526 Epoch 85/100, Average MSE Loss in original units: 24.05788548925725 Epoch 86/100, Average MSE Loss in original units: 24.049042407991085 Epoch 87/100, Average MSE Loss in original units: 24.039438623204067 Epoch 88/100, Average MSE Loss in original units: 24.04029442534095 Epoch 89/100, Average MSE Loss in original units: 24.02420709041609 Epoch 90/100, Average MSE Loss in original units: 24.019393654402148 Epoch 91/100, Average MSE Loss in original units: 24.01398873077832 Epoch 92/100, Average MSE Loss in original units: 24.008107776343124 Epoch 93/100, Average MSE Loss in original units: 24.00188506096783 Epoch 94/100, Average MSE Loss in original units: 23.996122958773643 Epoch 95/100, Average MSE Loss in original units: 23.983494185770823 Epoch 96/100, Average MSE Loss in original units: 23.979227915414857 Epoch 97/100, Average MSE Loss in original units: 23.969764627942816 Epoch 98/100, Average MSE Loss in original units: 23.965654533243466 Epoch 99/100, Average MSE Loss in original units: 23.961272042122403 Epoch 100/100, Average MSE Loss in original units: 23.955823165180455 Finished Training
test_loss = 0.0
for batch_idx, (data, targets) in enumerate(test_loader):
data, targets = data.to(device), targets.to(device)
loss = criterion(model(data), targets)
test_loss += loss.item()
test_loss = test_loss / len(test_loader)
train_loss = 0.0
for batch_idx, (data, targets) in enumerate(train_loader):
data, targets = data.to(device), targets.to(device)
loss = criterion(model(data), targets)
train_loss += loss.item()
train_loss = train_loss / len(train_loader)
print("Train Loss (in original units square root of average MSE):", train_loss ** 0.5)
print("Test Loss (in original units square root of average MSE):", test_loss ** 0.5)
print("Coefficient of Determination of Test Data", 1 - test_loss/np.var(y_test))
plt.plot(avg_losses)
plt.title("Losses over Time")
plt.xlabel("Epoch")
plt.ylabel("Loss (MSE)")
plt.show()
/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/nn/modules/loss.py:610: UserWarning: Using a target size (torch.Size([54])) that is different to the input size (torch.Size([54, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size. return F.mse_loss(input, target, reduction=self.reduction)
Train Loss (in original units square root of average MSE): 23.991617163450787 Test Loss (in original units square root of average MSE): 24.131351592222824 Coefficient of Determination of Test Data -0.1608975
We see that the coefficient of determination is negative. This means that this model is worse at predicting popularity than the mean itself. Recall the formula for coefficient of determination is $$R^{2} = 1-\frac {\sum_{i} (y_{i}- \hat{y}_{i})^{2}} {\sum_{i} (y_{i}- \bar{y}_{i})^{2}} = 1- \frac {MSE(y)} {Var(y)}$$ We prefer that MSE is low, so our goal is for this coefficient to be as close to 1 as possible. Therefore, this is a poor model for predicting popularity.
spotify_df['popularity'].describe()
count 113549.000000 mean 33.324433 std 22.283855 min 0.000000 25% 17.000000 50% 35.000000 75% 50.000000 max 100.000000 Name: popularity, dtype: float64
## define code for visualizing the results
def ml_visual(sample, target, preds):
sample_df = pd.DataFrame(sample, columns= X.columns)
target_df = pd.DataFrame(target, columns= ['popularity'])
preds_df = pd.DataFrame(preds, columns= ['predicted_popularity'])
sample_df = sample_df.merge(target_df, left_index=True, right_index=True)
sample_df = sample_df.merge(preds_df, left_index=True, right_index=True)
display(sample_df.head(10))
plt.scatter(target, preds)
plt.xlabel('actual popularity')
plt.ylabel('predicted popularity')
plt.plot(target, target)
plt.show()
#Visualize the model accuracy with some predictions
sample, target = next(iter(test_loader))
sample = sample.to(device)
preds = model(sample)
sample = sample.cpu().numpy()
target = target.cpu().numpy()
preds = preds.cpu().detach().numpy()
ml_visual(sample, target, preds)
| duration_ms | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre_target_enc | artists_target_enc | explicit_enc | popularity | predicted_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.182531 | -0.801713 | 0.246650 | 1.039759 | 0.245466 | 0.748811 | -0.335326 | 0.780587 | -0.503755 | 3.823774 | 0.866666 | -1.262497 | 0.225414 | 0.983683 | 0.613257 | -0.30251 | 45.0 | 35.856556 |
| 1 | -1.153896 | 0.494964 | 0.238694 | -0.930096 | 0.012982 | 0.748811 | -0.469085 | 0.651188 | -0.503750 | 2.060848 | 1.746144 | 1.203250 | 0.225414 | 0.496016 | 0.423640 | -0.30251 | 37.0 | 29.873962 |
| 2 | -0.489663 | 0.039686 | 1.332504 | -1.211504 | 0.841664 | -1.335451 | 0.918539 | -0.949616 | -0.431899 | 0.313806 | -0.618418 | -0.272638 | 0.225414 | -0.962407 | -0.453826 | -0.30251 | 17.0 | 26.164085 |
| 3 | 0.532040 | -1.643113 | -1.988699 | -0.367280 | -2.160372 | -1.335451 | -0.482557 | 1.993330 | 2.354289 | -0.580892 | -1.038870 | 0.441346 | -2.062984 | -0.374213 | -0.559937 | -0.30251 | 23.0 | 35.964867 |
| 4 | 0.578262 | -0.940025 | 1.113742 | 1.039759 | 1.089435 | 0.748811 | 0.254558 | -0.851651 | -0.503755 | -0.604716 | 0.380638 | 1.862487 | 0.225414 | -0.761678 | -0.675422 | -0.30251 | 25.0 | 29.396473 |
| 5 | -0.753662 | 1.100080 | 0.708038 | 1.039759 | 0.687204 | 0.748811 | 1.457422 | -0.633778 | -0.503755 | -0.804302 | 1.321834 | 2.269206 | 0.225414 | 0.018907 | -0.145209 | 3.30568 | 31.0 | 41.923275 |
| 6 | -0.403861 | 0.229865 | 0.843273 | -0.085872 | 0.723139 | -1.335451 | -0.426744 | -0.929501 | -0.503751 | 0.149689 | -0.548985 | -0.005563 | 0.225414 | 0.986089 | 1.345568 | -0.30251 | 59.0 | 25.103073 |
| 7 | -0.175806 | -0.928499 | -0.922732 | 0.476944 | 0.217274 | 0.748811 | -0.510463 | 0.082432 | -0.503755 | -0.083250 | -0.934721 | -0.074064 | -2.062984 | 0.443435 | 1.020097 | -0.30251 | 58.0 | 23.098797 |
| 8 | -0.426109 | 0.281732 | -0.473276 | 0.195536 | 0.378682 | 0.748811 | 0.254558 | 1.244018 | -0.503755 | -0.692597 | 0.993958 | 0.360062 | 0.225414 | -0.511088 | -0.223671 | -0.30251 | 29.0 | 20.275993 |
| 9 | 0.588058 | -1.406829 | 1.300684 | -1.211504 | 0.647299 | -1.335451 | 0.027457 | -0.949750 | 1.972353 | 0.504392 | -1.692307 | 0.271249 | 0.225414 | -0.096352 | 0.522406 | -0.30251 | 23.0 | 29.406485 |
You can tell that it's guessing around the mean 33 every time quite consistently. Something to try next would be to train the model without categorical variables: key, mode, time signature. However, considering the knowledge that our features don't have a clear linear relationship with the target, perhaps we can try K-Nearest Neighbors.
SVR (Support Vector Regressor)¶
Support Vector Regression is a type of Support Vector Machine used for regression analysis. This model can help us see if a song feature has a strong relationship with the target variable, popularity. In this example, we will use instrumentalness as the X-value and popularity as the Y-value, as instrumentalness is a continuous variable that has a slightly higher correlation with popularity.
For now, we will split the dataset into testing and training sets, and fit the data and model.
(Note: we are only testing the model on one x-variable because it takes a while to fit the data).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
# Get the X and Y data
X = np.array(spotify_df['instrumentalness']).reshape(-1, 1)
Y = np.array(spotify_df['popularity'])
# Split dataset into testing and training sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Fit the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Initialize and fit the model
svr_model = SVR(kernel='linear')
svr_model.fit(X_train_scaled, Y_train)
# Make predictions on the test data
Y_pred = svr_model.predict(X_test_scaled)
Now let's graph and examine our model's performance.
# Model prediction for train dataset
Y_train_pred = svr_model.predict(X_train_scaled)
plt.scatter(X_train, Y_train)
plt.plot(X_train, Y_train_pred, color = 'orange', label = 'linear SVR')
plt.xlabel('Instrumentalness')
plt.ylabel('Popularity')
Text(0, 0.5, 'Popularity')
Now we can evaluate our linear model's perfomance, and plot the actual vs predicted target values.
from sklearn import metrics
# Calculate results
r2_score = round(metrics.r2_score(Y_test, Y_pred),2)
rmse = round(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)),2)
print(f'r2: {r2_score}')
print(f'rmse: {rmse}')
min_x = min(min(Y_pred), min(Y_test))
max_x = max(max(Y_pred), max(Y_test))
plt.scatter(Y_pred, Y_test)
plt.plot([min_x,max_x], [min_x,max_x], 'r--', label = '1:1')
plt.legend()
plt.xlabel('Prediction')
plt.ylabel('Actual')
r2: -0.0 rmse: 22.41
Text(0, 0.5, 'Actual')
Seems like the predictions of popularity scores based on instrumentalness are consistently lower than the actual values.
R^2 of 0.0 tells us that approximately 0% of the variance in valence can be explained by danceability.
Root mean squared error (RMSE) of 22.41 tells us the average magnitude of the prediction errors between the actual and predict valence scores.
So it looks like this model isn't the best when it comes to predicting popularity based off instrumentalness. Let's look at another model!
Linear Regression¶
Now we'll use a Linear Regression model to see the impact of individual continuous variables on popularity. For now, we will use instrumentalness, speechiness, loudness, and energy as X-values, and popularity as our Y-values. The first three X-values had the highest correlations with popularity, while the last one has one of the lowest correlations. We will plot each of those variables below:
import numpy as np
import matplotlib.pyplot as plt
# Get the data from the csv.
X_instrumentalness = np.array(spotify_df['instrumentalness']).reshape(-1, 1)
X_speechiness = np.array(spotify_df['speechiness']).reshape(-1, 1)
X_loudness = np.array(spotify_df['loudness']).reshape(-1, 1)
X_energy = np.array(spotify_df['energy']).reshape(-1, 1)
Y_popularity = np.array(spotify_df['popularity'])
# Plot each X-variable with popularity
plt.scatter(X_instrumentalness , Y_popularity)
plt.xlabel('Instrumentalness')
plt.ylabel('Popularity')
plt.title('Popularity as a function of Instrumentalness')
plt.show()
plt.scatter(X_speechiness, Y_popularity)
plt.xlabel('Speechiness')
plt.ylabel('Popularity')
plt.title('Popularity as a function of Speechiness')
plt.show()
plt.scatter(X_loudness, Y_popularity)
plt.xlabel('Loudness')
plt.ylabel('Popularity')
plt.title('Popularity as a function of Loudness')
plt.show()
plt.scatter(X_energy, Y_popularity)
plt.xlabel('Energy')
plt.ylabel('Popularity')
plt.title('Popularity as a function of Energy')
plt.show()
Wow, those are some crowded scatter plots! Anyways, we are now going to divide the data into training and testing sets.
from sklearn.model_selection import train_test_split
random_state = 42
np.random.seed(random_state)
test_size = 0.2
def split_data(X, Y, test_size=test_size, random_state=random_state):
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)
return X_train, X_test, Y_train, Y_test
X_train_instrumentalness, X_test_instrumentalness, Y_train_instrumentalness, Y_test_instrumentalness = split_data(X_instrumentalness, Y_popularity)
X_train_speechiness, X_test_speechiness, Y_train_speechiness, Y_test_speechiness = split_data(X_speechiness, Y_popularity)
X_train_loudness, X_test_loudness, Y_train_loudness, Y_test_loudness = split_data(X_loudness, Y_popularity)
X_train_energy, X_test_energy, Y_train_energy, Y_test_energy = split_data(X_energy, Y_popularity)
Let's visualize out split dataset.
def draw_scatter(X_train, Y_train, X_test, Y_test, title, xlabel='Feature', ylabel='Target'):
plt.scatter(X_train, Y_train, color='blue', label='Train', alpha=0.5)
plt.scatter(X_test, Y_test, color='red', label='Test', alpha=0.5)
plt.title(title)
plt.xlabel(xlabel)
if ylabel:
plt.ylabel(ylabel)
plt.legend()
plt.show()
draw_scatter(X_train_instrumentalness, Y_train_instrumentalness, X_test_instrumentalness, Y_test_instrumentalness, 'Popularity as a function of Instrumentalness')
draw_scatter(X_train_speechiness, Y_train_speechiness, X_test_speechiness, Y_test_speechiness, 'Popularity as a function of Speechiness')
draw_scatter(X_train_loudness, Y_train_loudness, X_test_loudness, Y_test_loudness, 'Popularity as a function of Loudness')
draw_scatter(X_train_energy, Y_train_energy, X_test_energy, Y_test_energy, 'Popularity as a function of energy')
Now we will fit a line for the given datasets.
from sklearn.linear_model import LinearRegression
# Function to fit the model
def fit_model(X_train, Y_train):
model = LinearRegression()
model.fit(X_train, Y_train)
return model
# Function to predict using the fitted model
def predict_data(model, X_train, X_test):
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
return Y_train_pred, Y_test_pred
model_instrumentalness = fit_model(X_train_instrumentalness, Y_train_instrumentalness)
Y_train_pred_instrumentalness, Y_test_pred_instrumentalness = predict_data(
model_instrumentalness, X_train_instrumentalness, X_test_instrumentalness)
model_speechiness = fit_model(X_train_speechiness, Y_train_speechiness)
Y_train_pred_speechiness, Y_test_pred_speechiness = predict_data(
model_speechiness, X_train_speechiness, X_test_speechiness)
model_loudness = fit_model(X_train_loudness, Y_train_loudness)
Y_train_pred_loudness, Y_test_pred_loudness = predict_data(
model_loudness, X_train_loudness, X_test_loudness)
model_energy = fit_model(X_train_energy, Y_train_energy)
Y_train_pred_energy, Y_test_pred_energy = predict_data(
model_energy, X_train_energy, X_test_energy)
Now we plot the scatter plot with the regression line
def draw_scatter_with_regression(X_train, Y_train, X_test, Y_test, Y_train_pred, title, xlabel='Feature', ylabel='Target'):
plt.scatter(X_train, Y_train, color='blue', label='Train')
plt.scatter(X_test, Y_test, color='orange', label='Test')
plt.plot(X_train, Y_train_pred, color='green', label='Regression Line')
plt.title(title)
plt.xlabel(xlabel)
if ylabel:
plt.ylabel(ylabel)
plt.legend()
plt.show()
draw_scatter_with_regression(X_train_instrumentalness, Y_train_instrumentalness,
X_test_instrumentalness, Y_test_instrumentalness,
Y_train_pred_instrumentalness,
'Popularity as a function of Instrumentalness')
draw_scatter_with_regression(X_train_speechiness, Y_train_speechiness,
X_test_speechiness, Y_test_speechiness,
Y_train_pred_speechiness,
'Popularity as a function of Speechiness')
draw_scatter_with_regression(X_train_loudness, Y_train_loudness,
X_test_loudness, Y_test_loudness,
Y_train_pred_loudness,
'Popularity as a function of Loudness')
draw_scatter_with_regression(X_train_energy, Y_train_energy, X_test_energy,
Y_test_energy, Y_train_pred_energy,
'Popularity as a function of Energy')
Well, it doesn't look like the models performed well.
All of the MSE values for the models are high, indicating that our models are making large errors when using their respective variables to predict popularity.
Since the all the R^2 coefficients are close to 0, that means there is no linear relationship between the observed and predicted values. So instrumentalness, speechiness, loudness, and energy do not have a strong relationship with popularity, though instrumentalness does have the strongest relationship by far.
It seems that this model isn't the best one to use to predict popularity. Maybe there is a better model?
XGBoost¶
df = pd.read_csv("dataset.csv")
spotify_df = df.drop(columns='Unnamed: 0')
spotify_df = spotify_df.dropna()
spotify_df = spotify_df.drop_duplicates()
freq_encoding = spotify_df['track_genre'].value_counts()
spotify_df['track_genre_freq'] = spotify_df['track_genre'].map(freq_encoding)
artist_encoding = spotify_df['artists'].value_counts()
spotify_df['track_artist_freq'] = spotify_df['artists'].map(artist_encoding)
spotify_df["explicitness_number"] = spotify_df["explicit"].apply(lambda x: 1 if x else 0)
spotify_df.sample(20)
numerical_df = spotify_df.drop(columns = ["track_id", "artists", "album_name", "track_name", "explicit", "track_genre"])
spotify_df["tryout"] = np.log(spotify_df["instrumentalness"] + 1) + np.log(spotify_df["loudness"] * - 1) - (spotify_df["track_genre_freq"])
#spotify_df = spotify_df.drop(columns = ["energy", "acousticness"])
#spotify_df = spotify_df.drop(columns = ["loudness"])
#spotify_df = spotify_df.drop(columns = ["valence"])
# Energy and acousticness and loudness are highly correlated with instrumentalness and instrumentalness is more correlated with popularity so its kept
#Valence is highly correlated wtih danceabiliy and valence has no correlation with popularity so we get rid of it
c:\Users\mail2\anaconda3\envs\MindLabs\Lib\site-packages\pandas\core\arraylike.py:396: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
X = numerical_df.drop(columns = ["popularity"])
y = numerical_df["popularity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
#Source: https://medium.com/@bharataameriya/using-xgboost-in-pytorch-for-enhanced-model-performance-d4c9f9e10225
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)
# Define model parameters
params = {
"objective": "reg:squarederror",
"eval_metric": "rmse",
"max_depth": 23,
"eta": 0.1
}
# Train XGBoost model
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=100)
preds = xgb_model.predict(test_dmatrix)
preds_torch = torch.tensor(preds, dtype=torch.float32)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")
Root Mean Squared Error (RMSE): 18.9925 R² Score: 0.2809
plt.figure(figsize=(6, 4))
plt.hist(residuals, bins=30, edgecolor='k', alpha=0.7)
plt.title("Prediction Errors Distribution")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()
xgb.plot_importance(xgb_model, importance_type='gain', max_num_features=20)
plt.title('Top 10 Feature Importances')
plt.show()